Imports and function definitions
import pandas as pd
import plotly.express as px
import numpy as np
from datetime import datetime as dt
from matplotlib import pyplot as plt
import re
from matplotlib.ticker import EngFormatter
%matplotlib inline
# Global font size applied to every matplotlib figure below; adjust it here.
plt.rcParams.update({'font.size': 18})
# Convert dates from DD-MM-YYYY to YYYY-MM-DD (any time-of-day suffix is kept).
def adjustDateFormat(date):
    """Rewrite every ``D-M-YYYY`` date found in *date* as ``YYYY-MM-DD``.

    Day and month are zero-padded to two digits so the result really is in
    YYYY-MM-DD form even for inputs such as ``5-3-2021``. Text around the
    date (for example ``hh:mm:ss``) is returned unchanged, and strings that
    are already in YYYY-MM-DD order do not match and pass through as is.

    Parameters:
        date: the string to convert.

    Returns:
        The converted string.

    Raises:
        TypeError: if *date* is not a string (callers drop NaN beforehand).
    """
    def _to_iso(match):
        day, month, year = match.groups()
        return '{}-{}-{}'.format(year, month.zfill(2), day.zfill(2))

    # The digit look-arounds stop the pattern from matching in the middle of
    # a longer run of digits (e.g. the tail of "112-03-2021").
    return re.sub(r'(?<!\d)(\d{1,2})-(\d{1,2})-(\d{4})(?!\d)', _to_iso, date)
# Text clean-up applied to tweets and Reddit comments before word counting
# and sentiment analysis: removes mentions, hashtag signs, links, a few stop
# words, digits and non-ASCII characters.
def cleanUpTweetsAndComments(txt):
    """Return *txt* with social-media noise and a few stop words removed.

    The English stop words ("the", "and", "to", "i'm") are removed only when
    they appear as whole words. Removing them as bare substrings mangled
    ordinary words ("pandemic" became "pemic", "Another" became "Anor",
    "photo today" became "photoday"), which distorted both the word counts
    and the text passed on to sentiment analysis.

    Parameters:
        txt: the raw tweet or comment text.

    Returns:
        The cleaned string. Whitespace left behind by removals is not
        collapsed.

    Raises:
        TypeError: if *txt* is not a string (callers drop NaN beforehand).
    """
    # Remove mentions
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    # Remove the hashtag sign but keep the tag's word
    txt = re.sub(r'#', '', txt)
    # Remove ampersands together with the rest of an HTML entity, so that
    # "&amp;" does not leave a stray "amp;" token behind
    txt = re.sub(r'&(?:\w+;)?', '', txt)
    # Remove the retweet marker (what is left of "RT @user: " once the
    # mention is gone)
    txt = re.sub(r'RT : ', '', txt)
    # Remove urls
    txt = re.sub(r'http\S+', '', txt, flags=re.IGNORECASE)
    # Remove stop words, as whole words only
    txt = re.sub(r'\bthe\b', '', txt, flags=re.IGNORECASE)
    txt = re.sub(r'\band\b', '', txt, flags=re.IGNORECASE)
    txt = re.sub(r'\bto ', '', txt, flags=re.IGNORECASE)
    txt = re.sub(r"\bi'm\b", '', txt, flags=re.IGNORECASE)
    # Topic terms are still removed as substrings, so compound tags that
    # contain them are stripped of them as well
    txt = re.sub(r'covid', '', txt, flags=re.IGNORECASE)
    txt = re.sub(r'vaccine', '', txt, flags=re.IGNORECASE)
    # Remove digits
    txt = re.sub(r'[0-9]', '', txt)
    # Remove non-ASCII characters
    txt = re.sub(r'[^\x00-\x7f]', '', txt)
    return txt
# Load the source datasets from the local `datasets/` folder.
# Available at https://www.kaggle.com/gpreda/all-covid19-vaccines-tweets
twitter_df = pd.read_csv('datasets/vaccination_all_tweets.csv', low_memory=False)
# The remaining three files are read with pandas' default options, in order:
#   reddit_df          https://www.kaggle.com/xhlulu/covid19-vaccine-news-reddit-discussions
#   twitter_pfizer_df  https://www.kaggle.com/gpreda/pfizer-vaccine-tweets
#   reddit_pfizer_df   https://www.kaggle.com/gpreda/pfizer-vaccine-on-reddit
reddit_df, twitter_pfizer_df, reddit_pfizer_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/reddit_comments.csv',
        'datasets/pfizer_biontech_tweets.csv',
        'datasets/reddit_pfizer_vaccine.csv',
    )
)
Evolution of tweets about COVID-19 vaccines used in entire world
# Alternative dataset, kept for reference:
#twitter_df=pd.read_csv('datasets/covidvaccine_hashtag.csv', low_memory=False)
# Available at https://www.kaggle.com/gpreda/all-covid19-vaccines-tweets
twitter_df = pd.read_csv('datasets/vaccination_all_tweets.csv', low_memory=False)

# Normalise the date strings, then parse them. Values that do not fit the
# format become NaT (errors='coerce') and are dropped before counting.
twitter_df['date'] = twitter_df['date'].dropna().apply(adjustDateFormat)
twitter_data = pd.to_datetime(
    twitter_df['date'],
    format='%Y-%m-%d %H:%M:%S.%f',
    errors='coerce',
)

# One row per calendar day holding the number of tweets dated that day,
# ordered chronologically.
tweets_per_day = pd.to_datetime(twitter_data.dropna()).dt.floor('d').value_counts()
df = tweets_per_day.rename_axis('date').reset_index(name='count')
df = df.sort_values(by='date')

# Engineering notation (k, M, ...) for the Y axis
formatter = EngFormatter()
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(formatter)
plt.plot(df['date'], df['count'], c='orange')
plt.xlabel('Date')
plt.ylabel('Number of interactions')
plt.title('Evolution of tweets about COVID-19 vaccines used in entire world')
plt.show()
Evolution of discussions about COVID-19 vaccination on Reddit
# Normalise the post dates, then parse them. Values that do not fit the
# format become NaT (errors='coerce') and are dropped before counting.
reddit_df['post_date'] = reddit_df['post_date'].dropna().apply(adjustDateFormat)
reddit_data = pd.to_datetime(
    reddit_df['post_date'],
    format='%Y-%m-%d %H:%M:%S.%f',
    errors='coerce',
)

# One row per calendar day holding the number of entries dated that day,
# ordered chronologically.
posts_per_day = pd.to_datetime(reddit_data.dropna()).dt.floor('d').value_counts()
df = posts_per_day.rename_axis('date').reset_index(name='count')
df = df.sort_values(by='date')

# Engineering notation (k, M, ...) for the Y axis
formatter = EngFormatter()
plt.gca().yaxis.set_major_formatter(formatter)
plt.xticks(rotation=45)
plt.plot(df['date'], df['count'], c='orange')
plt.xlabel('Date')
plt.ylabel('Number of posts')
plt.title('Evolution of discussions about COVID-19 vaccination on Reddit')
plt.show()
Generating word cloud of tweets about COVID-19 vaccines
from wordcloud import WordCloud, STOPWORDS
from PIL import Image


def _read_stopword_file(path, separator):
    """Return the set of non-empty, stripped entries stored in *path*.

    The file is read inside a ``with`` block so the handle is always closed.
    Entries are stripped because whitespace or line breaks around the
    separator would otherwise produce entries that can never equal a word.
    """
    with open(path, "r") as handle:
        entries = (entry.strip() for entry in handle.read().split(separator))
        return {entry for entry in entries if entry}


# Start from wordcloud's built-in list and extend it with the project lists.
stopwords = set(STOPWORDS)

# Comma-separated lists
for list_name in (
    'twitter-stopwords.txt',
    'twitter-stopwords - TA.txt',
    'common-english-verbs.txt',
    'common-english-prep-conj.txt',
    'common-english-words.txt',
    'smart-common-words.txt',
):
    stopwords |= _read_stopword_file('stopwords/' + list_name, ',')

# One-entry-per-line lists
for list_name in ('stopWords.txt', 'punctuation.txt'):
    stopwords |= _read_stopword_file('stopwords/' + list_name, '\n')

# Hand-picked additions: topic words shared by nearly every tweet plus
# fragments produced by the text clean-up.
stopwords.update([
    "00A0", "00BD", "00B8", "t", "co", "going", "ed", "covidvaccine",
    "covid", "corona", "bot", "covid19", "covid-19", "coronavirusvaccine",
    "coronavaccine", "got", "will", "day", "re", "coronavirus", "new", "now",
    "one", "vaccine", "covidvaccination", "vaccination", "vaccinated",
    "vaccinations", "vaccinate", "vaccines", "s", "m", "ir", "pemic",
    # "pemic" is what substring removal of "and" leaves of "pandemic"; the
    # intact word is listed too so the filter does not rely on that artefact.
    "pandemic",
    "covid_19", "virus", "dose", "doses",
])

# Build one upper-cased text out of all cleaned tweets and render the cloud.
twitter_cloud_text = " ".join(
    twitter_df['text'].dropna().apply(cleanUpTweetsAndComments).str.upper()
)
wordcloud = WordCloud(
    background_color="white", stopwords=stopwords, random_state=2016
).generate(twitter_cloud_text)
plt.figure(figsize=(10,4))
plt.imshow(wordcloud)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
Generating word cloud of Reddit users about COVID-19 vaccines
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import re


def _read_stopword_file(path, separator):
    """Return the set of non-empty, stripped entries stored in *path*.

    The file is read inside a ``with`` block so the handle is always closed.
    Entries are stripped because whitespace or line breaks around the
    separator would otherwise produce entries that can never equal a word.
    """
    with open(path, "r") as handle:
        entries = (entry.strip() for entry in handle.read().split(separator))
        return {entry for entry in entries if entry}


# Start from wordcloud's built-in list and extend it with the project lists.
stopwords_reddit = set(STOPWORDS)

# Comma-separated lists
for list_name in (
    'twitter-stopwords.txt',
    'twitter-stopwords - TA.txt',
    'common-english-verbs.txt',
    'common-english-prep-conj.txt',
    'common-english-words.txt',
    'smart-common-words.txt',
):
    stopwords_reddit |= _read_stopword_file('stopwords/' + list_name, ',')

# One-entry-per-line lists
for list_name in ('stopWords.txt', 'punctuation.txt'):
    stopwords_reddit |= _read_stopword_file('stopwords/' + list_name, '\n')

# Hand-picked additions: topic words shared by nearly every comment plus
# fragments produced by the text clean-up.
stopwords_reddit.update([
    "00A0", "00BD", "00B8", "t", "co", "going", "ed", "covidvaccine",
    "covid", "corona", "bot", "covid19", "coronavirusvaccine",
    "coronavaccine", "got", "will", "day", "re", "coronavirus", "new", "now",
    "one", "vaccine", "covidvaccination", "vaccination", "vaccinated",
    "vaccinations", "vaccinate", "vaccines", "s", "m", "ir", "pemic",
    # "pemic" is what substring removal of "and" leaves of "pandemic"; the
    # intact word is listed too so the filter does not rely on that artefact.
    "pandemic",
    "covid_19", "virus", '%', '[removed]', 'se', 'it.', "y're",
    "dose", "doses",
])

# Build one upper-cased text out of all cleaned comments and render the cloud.
reddit_cloud_text = " ".join(
    reddit_df['comment_body'].dropna().apply(cleanUpTweetsAndComments).str.upper()
)
wordcloud_reddit = WordCloud(
    background_color="white", stopwords=stopwords_reddit, random_state=2016
).generate(reddit_cloud_text)
plt.figure(figsize=(10,8))
plt.imshow(wordcloud_reddit)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
Performing most common tweeted words analysis
import collections
import re
import matplotlib.cm as cm
from matplotlib import rcParams

# Join all cleaned, lower-cased tweets and count every word that is not a
# stop word.
all_headlines = ' '.join(
    twitter_df['text'].dropna().apply(cleanUpTweetsAndComments).str.lower()
)
filtered_words = [token for token in all_headlines.split() if token not in stopwords]
counted_words = collections.Counter(filtered_words)

# The ten most frequent words, least frequent first, so the most frequent one
# ends up at the top of the chart.
ranked_ascending = counted_words.most_common(10)[::-1]
top_words_twitter = [ranked_word for ranked_word, _ in ranked_ascending]
counts = [ranked_count for _, ranked_count in ranked_ascending]

colors = cm.Wistia(np.linspace(0, 1, 10))
rcParams['figure.figsize'] = 20, 10
plt.title('Top words from tweets about vaccines for COVID-19 vs. their count')
formatter = EngFormatter()
plt.gca().xaxis.set_major_formatter(formatter)
plt.xlabel('Count')
plt.ylabel('Words')
# Uncomment the next line for a horizontal bar chart instead of tick marks
#plt.barh(top_words_twitter, counts, color=colors)
plt.plot(counts, top_words_twitter, '|', markersize=30, markeredgewidth=3 ,c='#2604CF')
plt.show()
Performing most common words analysis on Reddit
import collections
import re
import matplotlib.cm as cm
from matplotlib import rcParams

# Join all cleaned, lower-cased comments and count every word that is not a
# stop word.
all_headlines = ' '.join(
    reddit_df['comment_body'].dropna().apply(cleanUpTweetsAndComments).str.lower()
)
filtered_words = [word for word in all_headlines.split() if word not in stopwords_reddit]
counted_words = collections.Counter(filtered_words)

# The ten most frequent words, least frequent first, so the most frequent one
# ends up at the top of the chart.
top_words_reddit = []
counts = []
for word, count in reversed(counted_words.most_common(10)):
    top_words_reddit.append(word)
    counts.append(count)

colors = cm.Wistia(np.linspace(0, 1, 10))
rcParams['figure.figsize'] = 20, 10
plt.title('Top words about COVID-19 vaccination on Reddit vs. their count')
# Create the formatter here instead of relying on one left behind by a
# previously executed cell.
formatter = EngFormatter()
plt.gca().xaxis.set_major_formatter(formatter)
plt.xlabel('Count')
plt.ylabel('Words')
# Uncomment the next line for a horizontal bar chart instead of tick marks
#plt.barh(top_words_reddit, counts, color=colors)
plt.plot(counts, top_words_reddit, '|', markersize=30, markeredgewidth=3 ,c='#2604CF')
plt.show()
Top words evolution in Tweets
# One line per top word: number of tweets per day that contain the word.
colors=['#4D6D3E', '#9E7824', '#BD92E1','#A22AC0', '#E59A13', '#2640A6', '#229287', '#C936A0', '#BD8377', '#B38D75']
for word, line_color in zip(top_words_twitter, colors):
    # Match the word only when it is not glued to another letter or digit.
    # re.escape keeps words that contain regex metacharacters literal.
    regex = r'(?<![^\W_])' + re.escape(word) + r'(?![^\W_])'
    # na=False makes rows without text count as "no match", which keeps the
    # mask aligned with every row of twitter_df.
    contains_word = twitter_df['text'].str.contains(regex, case=False, na=False)
    top_word_df = (pd.to_datetime(twitter_df.loc[contains_word, 'date'].dropna())
                   .dt.floor('d')
                   .value_counts()
                   .rename_axis('date')
                   .reset_index(name='count'))
    top_word_df = top_word_df.sort_values(by='date')
    plt.plot(top_word_df['date'], top_word_df['count'], label=word, color=line_color)
plt.legend()
# Engineering notation (k, M, ...) for the Y axis
formatter = EngFormatter()
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(formatter)
plt.xlabel('Date')
plt.ylabel('Number of tweets containing the word')
plt.show()
Top words evolution in Reddit posts
# One line per top word: number of Reddit entries per day that contain the word.
colors=['#4D6D3E', '#9E7824', '#BD92E1','#A22AC0', '#E59A13', '#2640A6', '#229287', '#C936A0', '#BD8377', '#B38D75']
for word, line_color in zip(top_words_reddit, colors):
    # Match the word only when it is not glued to another letter or digit.
    # re.escape keeps words that contain regex metacharacters literal.
    regex = r'(?<![^\W_])' + re.escape(word) + r'(?![^\W_])'
    # na=False makes rows without text count as "no match", which keeps the
    # mask aligned with every row of reddit_df.
    contains_word = reddit_df['comment_body'].str.contains(regex, case=False, na=False)
    top_word_df = (pd.to_datetime(reddit_df.loc[contains_word, 'post_date'].dropna())
                   .dt.floor('d')
                   .value_counts()
                   .rename_axis('date')
                   .reset_index(name='count'))
    top_word_df = top_word_df.sort_values(by='date')
    plt.plot(top_word_df['date'], top_word_df['count'], label=word, color=line_color)
plt.legend()
# Engineering notation (k, M, ...) for the Y axis
formatter = EngFormatter()
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(formatter)
plt.xlabel('Date')
plt.ylabel('Number of posts containing the word')
plt.show()
Reddit sentiment analysis about COVID-19 vaccines
from textblob import TextBlob

reddit_sentimental_analysis_df = pd.DataFrame()


# Subjectivity/polarity helpers, based on
# https://towardsdatascience.com/sentiment-analysis-evaluating-the-publics-perception-of-the-covid19-vaccine-bef564591078
def getTextSubjectivity(txt):
    """Return the subjectivity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity


def getTextPolarity(txt):
    """Return the polarity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity


def getTextAnalysis(a):
    """Label polarity *a*: below zero "Negative", zero "Neutral", else "Positive"."""
    if a < 0:
        return "Negative"
    if a == 0:
        return "Neutral"
    return "Positive"


# Score every cleaned comment and derive its sentiment label.
reddit_comments = reddit_df['comment_body'].dropna().apply(cleanUpTweetsAndComments)
reddit_sentimental_analysis_df['Subjectivity'] = reddit_comments.apply(getTextSubjectivity)
reddit_sentimental_analysis_df['Polarity'] = reddit_comments.apply(getTextPolarity)
reddit_sentimental_analysis_df['Score'] = (
    reddit_sentimental_analysis_df['Polarity'].apply(getTextAnalysis)
)

# Bar chart with the number of comments per sentiment label.
score_sizes = reddit_sentimental_analysis_df.groupby('Score').size()
labels = score_sizes.index.values
values = score_sizes.values
plt.bar(labels, values, color = ['tab:olive', 'tab:orange', 'tab:purple'])
#plt.title(label = "Vaccine Sentiment Analysis - Reddit", fontsize = '15')
formatter = EngFormatter()
plt.gca().yaxis.set_major_formatter(formatter)
plt.ylabel('Number of posts')

# Print the percentage of positive, neutral and negative comments.
total_rows = reddit_sentimental_analysis_df.shape[0]
for score_label, suffix in (
    ('Positive', " % of positive posts"),
    ('Neutral', " % of neutral posts"),
    ('Negative', " % of negative posts"),
):
    matching_rows = reddit_sentimental_analysis_df[
        reddit_sentimental_analysis_df['Score'] == score_label
    ]
    print(str(matching_rows.shape[0]/total_rows*100) + suffix)
52.24344224574321 % of positive posts 28.23573400828348 % of neutral posts 19.52082374597331 % of negative posts
Twitter sentiment analysis about COVID-19 vaccine
from textblob import TextBlob

twitter_sentimental_analysis_df = pd.DataFrame()


# Subjectivity/polarity helpers, based on
# https://towardsdatascience.com/sentiment-analysis-evaluating-the-publics-perception-of-the-covid19-vaccine-bef564591078
def getTextSubjectivity(txt):
    """Return the subjectivity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity


def getTextPolarity(txt):
    """Return the polarity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity


def getTextAnalysis(a):
    """Label polarity *a*: below zero "Negative", zero "Neutral", else "Positive"."""
    if a < 0:
        return "Negative"
    if a == 0:
        return "Neutral"
    return "Positive"


# Score every cleaned tweet and derive its sentiment label.
tweets = twitter_df['text'].dropna().apply(cleanUpTweetsAndComments)
twitter_sentimental_analysis_df['Subjectivity'] = tweets.apply(getTextSubjectivity)
twitter_sentimental_analysis_df['Polarity'] = tweets.apply(getTextPolarity)
twitter_sentimental_analysis_df['Score'] = (
    twitter_sentimental_analysis_df['Polarity'].apply(getTextAnalysis)
)

# Bar chart with the number of tweets per sentiment label.
score_sizes = twitter_sentimental_analysis_df.groupby('Score').size()
labels = score_sizes.index.values
values = score_sizes.values
plt.bar(labels, values, color = ['tab:olive', 'tab:orange', 'tab:purple'])
#plt.title(label = "Vaccine Sentiment Analysis - Twitter", fontsize = '15')
formatter = EngFormatter()
plt.gca().yaxis.set_major_formatter(formatter)
plt.ylabel('Number of tweets')

# Print the percentage of positive, neutral and negative tweets.
total_rows = twitter_sentimental_analysis_df.shape[0]
for score_label, suffix in (
    ('Positive', " % of positive tweets"),
    ('Neutral', " % of neutral tweets"),
    ('Negative', " % of negative tweets"),
):
    matching_rows = twitter_sentimental_analysis_df[
        twitter_sentimental_analysis_df['Score'] == score_label
    ]
    print(str(matching_rows.shape[0]/total_rows*100) + suffix)
41.10243271479031 % of positive tweets 46.168515662570684 % of neutral tweets 12.729051622639007 % of negative tweets
Pfizer/BioNTech Vaccine Sentiment Analysis - Twitter
from textblob import TextBlob

twitter_sentimental_analysis_df = pd.DataFrame()


# Subjectivity/polarity helpers, based on
# https://towardsdatascience.com/sentiment-analysis-evaluating-the-publics-perception-of-the-covid19-vaccine-bef564591078
def getTextSubjectivity(txt):
    """Return the subjectivity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity


def getTextPolarity(txt):
    """Return the polarity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity


def getTextAnalysis(a):
    """Label polarity *a*: below zero "Negative", zero "Neutral", else "Positive"."""
    if a < 0:
        return "Negative"
    if a == 0:
        return "Neutral"
    return "Positive"


# Score every cleaned Pfizer/BioNTech tweet and derive its sentiment label.
tweets = twitter_pfizer_df['text'].dropna().apply(cleanUpTweetsAndComments)
twitter_sentimental_analysis_df['Subjectivity'] = tweets.apply(getTextSubjectivity)
twitter_sentimental_analysis_df['Polarity'] = tweets.apply(getTextPolarity)
twitter_sentimental_analysis_df['Score'] = (
    twitter_sentimental_analysis_df['Polarity'].apply(getTextAnalysis)
)

# Bar chart with the number of tweets per sentiment label.
score_sizes = twitter_sentimental_analysis_df.groupby('Score').size()
labels = score_sizes.index.values
values = score_sizes.values
plt.bar(labels, values, color = ['tab:olive', 'tab:orange', 'tab:purple'])
#plt.title(label = "Pfizer/BioNTech Vaccine Sentiment Analysis - Twitter", fontsize = '15')
formatter = EngFormatter()
plt.gca().yaxis.set_major_formatter(formatter)
plt.ylabel('Number of tweets')

# Print the percentage of positive, neutral and negative tweets.
total_rows = twitter_sentimental_analysis_df.shape[0]
for score_label, suffix in (
    ('Positive', " % of positive tweets"),
    ('Neutral', " % of neutral tweets"),
    ('Negative', " % of negative tweets"),
):
    matching_rows = twitter_sentimental_analysis_df[
        twitter_sentimental_analysis_df['Score'] == score_label
    ]
    print(str(matching_rows.shape[0]/total_rows*100) + suffix)
44.63574457150199 % of positive tweets 45.11113966336888 % of neutral tweets 10.253115765129127 % of negative tweets
Pfizer/BioNTech Vaccine Sentiment Analysis - Posts on Reddit
from textblob import TextBlob

reddit_sentimental_analysis_df = pd.DataFrame()


# Subjectivity/polarity helpers, based on
# https://towardsdatascience.com/sentiment-analysis-evaluating-the-publics-perception-of-the-covid19-vaccine-bef564591078
def getTextSubjectivity(txt):
    """Return the subjectivity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity


def getTextPolarity(txt):
    """Return the polarity TextBlob reports for *txt*."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity


def getTextAnalysis(a):
    """Label polarity *a*: below zero "Negative", zero "Neutral", else "Positive"."""
    if a < 0:
        return "Negative"
    if a == 0:
        return "Neutral"
    return "Positive"


# Score every cleaned Pfizer/BioNTech Reddit entry and derive its label.
reddit_comments = reddit_pfizer_df['body'].dropna().apply(cleanUpTweetsAndComments)
reddit_sentimental_analysis_df['Subjectivity'] = reddit_comments.apply(getTextSubjectivity)
reddit_sentimental_analysis_df['Polarity'] = reddit_comments.apply(getTextPolarity)
reddit_sentimental_analysis_df['Score'] = (
    reddit_sentimental_analysis_df['Polarity'].apply(getTextAnalysis)
)

# Bar chart with the number of entries per sentiment label.
score_sizes = reddit_sentimental_analysis_df.groupby('Score').size()
labels = score_sizes.index.values
values = score_sizes.values
formatter = EngFormatter()
plt.gca().yaxis.set_major_formatter(formatter)
plt.bar(labels, values, color = ['tab:olive', 'tab:orange', 'tab:purple'])
#plt.title(label = "Pfizer/BioNTech Vaccine Sentiment Analysis - Posts on Reddit", fontsize = '15')
plt.ylabel('Number of posts')

# Print the percentage of positive, neutral and negative entries.
total_rows = reddit_sentimental_analysis_df.shape[0]
for score_label, suffix in (
    ('Positive', " % of positive posts"),
    ('Neutral', " % of neutral posts"),
    ('Negative', " % of negative posts"),
):
    matching_rows = reddit_sentimental_analysis_df[
        reddit_sentimental_analysis_df['Score'] == score_label
    ]
    print(str(matching_rows.shape[0]/total_rows*100) + suffix)
57.669172932330824 % of positive posts 17.518796992481203 % of neutral posts 24.81203007518797 % of negative posts
# Available at https://www.kaggle.com/gpreda/all-covid19-vaccines-tweets
twitter_df = pd.read_csv('datasets/vaccination_all_tweets.csv', low_memory=False)

# Normalise the date strings, then parse them. Values that do not fit the
# format become NaT (errors='coerce') and are dropped before counting.
twitter_df['date'] = twitter_df['date'].dropna().apply(adjustDateFormat)
twitter_data = pd.to_datetime(
    twitter_df['date'],
    format='%Y-%m-%d %H:%M:%S.%f',
    errors='coerce',
)

# One row per calendar day holding the number of tweets dated that day,
# ordered chronologically.
tweets_per_day = pd.to_datetime(twitter_data.dropna()).dt.floor('d').value_counts()
df = tweets_per_day.rename_axis('date').reset_index(name='count')
df = df.sort_values(by='date')

# Interactive line chart of the daily counts.
axis_labels = {
    "date": "Date",
    "count": "Number of interactions"
}
fig = px.line(df, x="date", y="count", hover_name="date", labels=axis_labels)
fig.update_xaxes(tickangle=45)
formatter = EngFormatter()
# 's' is the SI-prefix tick format; the X ticks show day, month name and year.
fig.update_layout(
    yaxis_tickformat='s',
    xaxis_tickformat='%d %B<br>%Y',
)
fig.show()
Evolution of discussions about COVID-19 vaccination on Reddit
# Normalise the post dates, then parse them. Values that do not fit the
# format become NaT (errors='coerce') and are dropped before counting.
reddit_df['post_date'] = reddit_df['post_date'].dropna().apply(adjustDateFormat)
reddit_data = pd.to_datetime(reddit_df['post_date'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')

# One row per calendar day holding the number of entries dated that day,
# ordered chronologically.
df = (pd.to_datetime(reddit_data.dropna())
      .dt.floor('d')
      .value_counts()
      .rename_axis('date')
      .reset_index(name='count'))
df = df.sort_values(by='date')

# The Y label matches the matplotlib version of this chart ("Number of
# posts"); "Number of interactions" is the label of the Twitter charts.
fig = px.line(df, x="date", y="count", hover_name="date",
              labels={
                  "date": "Date",
                  "count": "Number of posts"
              })
fig.update_xaxes(tickangle=45)
# 's' is the SI-prefix tick format; the X ticks show day, month name and year.
fig.update_layout(yaxis_tickformat='s')
fig.update_layout(
    xaxis_tickformat='%d %B<br>%Y'
)
fig.show()
Top words evolution in Tweets
# Collect, for every top word, the number of tweets per day containing it.
data = list()
for word in top_words_twitter:
    # Match the word only when it is not glued to another letter or digit.
    # re.escape keeps words that contain regex metacharacters literal.
    regex = r'(?<![^\W_])' + re.escape(word) + r'(?![^\W_])'
    # na=False makes rows without text count as "no match", which keeps the
    # mask aligned with every row of twitter_df.
    contains_word = twitter_df['text'].str.contains(regex, case=False, na=False)
    temp = (pd.to_datetime(twitter_df.loc[contains_word, 'date'].dropna())
            .dt.floor('d')
            .value_counts()
            .rename_axis('date')
            .reset_index(name='count'))
    temp = temp.sort_values(by='date')
    temp['top_word'] = word
    data.extend(temp.values.tolist())

# Long-format frame: one row per (day, word) pair, one coloured line per word.
df = pd.DataFrame(data, columns=['date','count','top_word'])
fig = px.line(df, x="date", y="count", hover_name="date", color='top_word',
              labels={
                  "date": "Date",
                  "count": "Number of tweets containing the word",
                  "top_word": "Top Word"
              })
fig.update_xaxes(tickangle=45)
fig.update_layout(yaxis_tickformat='s')
fig.update_layout(
    xaxis_tickformat='%d %B<br>%Y'
)
fig.show()
Top words evolution in Reddit posts
# Collect, for every top word, the number of Reddit entries per day containing it.
data = list()
for word in top_words_reddit:
    # Match the word only when it is not glued to another letter or digit.
    # re.escape keeps words that contain regex metacharacters literal.
    regex = r'(?<![^\W_])' + re.escape(word) + r'(?![^\W_])'
    # na=False makes rows without text count as "no match", which keeps the
    # mask aligned with every row of reddit_df.
    contains_word = reddit_df['comment_body'].str.contains(regex, case=False, na=False)
    temp = (pd.to_datetime(reddit_df.loc[contains_word, 'post_date'].dropna())
            .dt.floor('d')
            .value_counts()
            .rename_axis('date')
            .reset_index(name='count'))
    temp = temp.sort_values(by='date')
    temp['top_word'] = word
    data.extend(temp.values.tolist())

# Long-format frame: one row per (day, word) pair, one coloured line per word.
df = pd.DataFrame(data, columns=['date','count','top_word'])
fig = px.line(df, x="date", y="count", hover_name="date", color='top_word',
              labels={
                  "date": "Date",
                  "count": "Number of posts containing the word",
                  "top_word": "Top Word"
              })
fig.update_xaxes(tickangle=45)
fig.update_layout(yaxis_tickformat='s')
fig.update_layout(
    xaxis_tickformat='%d %B<br>%Y'
)
fig.show()
Performing most common tweeted words analysis
import collections
import re
import matplotlib.cm as cm
from matplotlib import rcParams

# Join all cleaned, lower-cased tweets and count every word that is not a
# stop word.
all_headlines = ' '.join(
    twitter_df['text'].dropna().apply(cleanUpTweetsAndComments).str.lower()
)
filtered_words = [token for token in all_headlines.split() if token not in stopwords]
counted_words = collections.Counter(filtered_words)

# The ten most frequent words, least frequent first, so the most frequent one
# ends up at the top of the horizontal bar chart.
ranked_ascending = counted_words.most_common(10)[::-1]
top_words_twitter = [ranked_word for ranked_word, _ in ranked_ascending]
counts = [ranked_count for _, ranked_count in ranked_ascending]

data = pd.DataFrame(data=top_words_twitter, columns=['top_word'])
data['count'] = counts
bar_labels = {
    "count": "Count",
    "top_word": "Word"
}
fig = px.bar(data, x="count", y="top_word", orientation='h', labels=bar_labels)
fig.show()
Performing most common words analysis on Reddit
import collections
import re
import matplotlib.cm as cm
from matplotlib import rcParams

# Join all cleaned, lower-cased comments and count every word that is not a
# stop word.
all_headlines = ' '.join(
    reddit_df['comment_body'].dropna().apply(cleanUpTweetsAndComments).str.lower()
)
filtered_words = [word for word in all_headlines.split() if word not in stopwords_reddit]
counted_words = collections.Counter(filtered_words)

# The ten most frequent words, least frequent first, so the most frequent one
# ends up at the top of the horizontal bar chart.
top_words_reddit = []
counts = []
for word, count in reversed(counted_words.most_common(10)):
    top_words_reddit.append(word)
    counts.append(count)

colors = cm.Wistia(np.linspace(0, 1, 10))
# The chart must pair the Reddit counts with the Reddit words; building the
# frame from the Twitter word list labelled the bars with the wrong words.
data = pd.DataFrame(data=top_words_reddit, columns=['top_word'])
data['count'] = counts
fig = px.bar(data, x="count", y="top_word", orientation='h',
             labels={
                 "count": "Count",
                 "top_word": "Word"
             })
fig.update_layout(xaxis_tickformat='s')
fig.show()